Part 1: Time series exploration and forecasting

energy <- read_csv(here("data", "energy.csv"))
## 
## ── Column specification ────────────────────────────────────────────────────────
## cols(
##   month = col_character(),
##   res_total = col_double(),
##   ind_total = col_double()
## )
energy_ts <- energy %>% 
  mutate(date = tsibble::yearmonth(month)) %>% 
  as_tsibble(key = NULL, index = date)

Raw data graph

ggplot(data = energy_ts, aes(x = date, y = res_total)) +
  geom_line() + 
  labs(y = "Residential energy consumption \n (Trillion BTU)")

Seasonplot

energy_ts %>% 
  gg_season(y = res_total) +
  theme_minimal() +
  labs(x = "month",
       y = "residential energy consumption (trillion BTU)")
## Warning in NextMethod("["): Incompatible methods (">=.Date", ">=.vctrs_vctr")
## for ">="
## Warning in NextMethod("["): Incompatible methods ("<=.Date", "<=.vctrs_vctr")
## for "<="
## Warning in NextMethod("["): Incompatible methods (">=.Date", ">=.vctrs_vctr")
## for ">="
## Warning in NextMethod("["): Incompatible methods ("<=.Date", "<=.vctrs_vctr")
## for "<="

  • The highest residential energy usage is around December / January / February
  • There is a secondary peak around July & August (that’s the repeated secondary peak we see in the original time series graph)
  • We can also see that the prevalence of that second peak has been increasing over the course of the time series: in 1973 (orange) there was hardly any summer peak. In more recent years (blue/magenta) that peak is much more prominent.

Subseries plot

energy_ts %>% gg_subseries(res_total)

  • Our takeaway here is similar: there is clear seasonality (higher values in winter months), with an increasingly evident second peak in June/July/August. This reinforces our takeaways from the raw data and seasonplots.

Decomposition

# Find STL decomposition
dcmp <- energy_ts %>% 
  model(STL(res_total ~ season()))

# View the components
components (dcmp)
## # A dable:           538 x 7 [1M]
## # Key:               .model [1]
## # STL Decomposition: res_total = trend + season_year + remainder
##    .model               date res_total trend season_year remainder season_adjust
##    <chr>               <mth>     <dbl> <dbl>       <dbl>     <dbl>         <dbl>
##  1 STL(res_total ~… 1973 Jan     1932. 1257.       684.      -9.21         1248.
##  2 STL(res_total ~… 1973 Feb     1687. 1253.       465.     -31.3          1222.
##  3 STL(res_total ~… 1973 Mar     1497. 1250.       242.       5.67         1255.
##  4 STL(res_total ~… 1973 Apr     1178. 1246.       -62.9     -5.49         1241.
##  5 STL(res_total ~… 1973 May     1015. 1242.      -256.      28.7          1271.
##  6 STL(res_total ~… 1973 Jun      934. 1238.      -313.       8.89         1247.
##  7 STL(res_total ~… 1973 Jul      981. 1234.      -231.     -22.0          1212.
##  8 STL(res_total ~… 1973 Aug     1019. 1231.      -225.      13.7          1244.
##  9 STL(res_total ~… 1973 Sep      957. 1227.      -314.      43.5          1271.
## 10 STL(res_total ~… 1973 Oct      992. 1224.      -271.      39.4          1263.
## # … with 528 more rows
# Visualize the decomposed components
components(dcmp) %>% autoplot() + 
  theme_minimal() 

Autocorrelation function (ACF)

energy_ts %>% 
  ACF(res_total) %>% 
  autoplot() 

  • We see that observations separated by 12 months are the most highly correlated, reflecting strong seasonality we see in all of our other exploratory visualizations.

Holt-Winters exponential smoothing

# Create the model: 
energy_fit <- energy_ts %>% 
  model(
    ets = ETS(res_total ~ season("M"))
  )

# Forecast using the model 10 years into the future: 
energy_forecast <- energy_fit %>% 
  forecast(h = "10 years")

# Plot just the forecasted values (with 80% and 95% CIs): 
energy_forecast %>% 
  autoplot() 

# Or plot it added to the original data: 

energy_forecast %>% 
  autoplot(energy_ts)

Assessing residuals

# Append the predicted values (and residuals) to original energy data 
energy_predicted <- broom::augment(energy_fit)

# Use View(energy_predicted) to see the resulting data frame
# Plot the actual energy values (res_total), and the predicted values (stored as .fitted) atop them
ggplot(data = energy_predicted) + 
  geom_line(aes(x = date, y = res_total)) + 
  geom_line(aes(x = date, y = .fitted), color = "red")

Residuals should be uncorrelated, centered at 0, and ideally normally distributed.

ggplot(data = energy_predicted, 
       aes(x = .resid)) + 
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Part 2: Spatial data wrangling, visualization, and a variogram

A. California county outlines (polygons)

# Read in the CA county shapefile 

ca_counties <- read_sf(here("data","ca_counties","CA_Counties_TIGER2016.shp"))

ca_subset <- ca_counties %>% 
  select(NAME, ALAND) %>% 
  rename(county_name = NAME, land_area = ALAND)

# Check and set the CRS 

ca_subset %>% st_crs() 
## Coordinate Reference System:
##   User input: WGS 84 / Pseudo-Mercator 
##   wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
##     BASEGEOGCRS["WGS 84",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["Popular Visualisation Pseudo-Mercator",
##         METHOD["Popular Visualisation Pseudo Mercator",
##             ID["EPSG",1024]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["False easting",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["World - 85°S to 85°N"],
##         BBOX[-85.06,-180,85.06,180]],
##     ID["EPSG",3857]]
# Plot the California counties using geom_sf()

ggplot(data = ca_subset) + 
  geom_sf(aes(fill = land_area),
          color = "white", 
          size = 0.1) +
  theme_void() + 
  scale_fill_gradientn(colors = c("cyan", "blue", "purple"))

B. Invasive red sesbania records (spatial points)

# Read in the data 
sesbania <- read_sf(here("data", "red_sesbania", "ds80.shp"))

# Check the CRS: 
sesbania %>% st_crs()
## Coordinate Reference System:
##   User input: Custom 
##   wkt:
## PROJCRS["Custom",
##     BASEGEOGCRS["NAD83",
##         DATUM["North American Datum 1983",
##             ELLIPSOID["GRS 1980",6378137,298.257222101,
##                 LENGTHUNIT["metre",1]],
##             ID["EPSG",6269]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["Degree",0.0174532925199433]]],
##     CONVERSION["unnamed",
##         METHOD["Albers Equal Area",
##             ID["EPSG",9822]],
##         PARAMETER["Latitude of false origin",0,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8821]],
##         PARAMETER["Longitude of false origin",-120,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8822]],
##         PARAMETER["Latitude of 1st standard parallel",34,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8823]],
##         PARAMETER["Latitude of 2nd standard parallel",40.5,
##             ANGLEUNIT["Degree",0.0174532925199433],
##             ID["EPSG",8824]],
##         PARAMETER["Easting at false origin",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8826]],
##         PARAMETER["Northing at false origin",-4000000,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8827]]],
##     CS[Cartesian,2],
##         AXIS["(E)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]],
##         AXIS["(N)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1,
##                 ID["EPSG",9001]]]]
# Update the CRS so it matches the CA counties CRS using st_transform()
sesbania <- st_transform(sesbania, 3857)

# Then check it: 
sesbania %>% st_crs()
## Coordinate Reference System:
##   User input: EPSG:3857 
##   wkt:
## PROJCRS["WGS 84 / Pseudo-Mercator",
##     BASEGEOGCRS["WGS 84",
##         DATUM["World Geodetic System 1984",
##             ELLIPSOID["WGS 84",6378137,298.257223563,
##                 LENGTHUNIT["metre",1]]],
##         PRIMEM["Greenwich",0,
##             ANGLEUNIT["degree",0.0174532925199433]],
##         ID["EPSG",4326]],
##     CONVERSION["Popular Visualisation Pseudo-Mercator",
##         METHOD["Popular Visualisation Pseudo Mercator",
##             ID["EPSG",1024]],
##         PARAMETER["Latitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8801]],
##         PARAMETER["Longitude of natural origin",0,
##             ANGLEUNIT["degree",0.0174532925199433],
##             ID["EPSG",8802]],
##         PARAMETER["False easting",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8806]],
##         PARAMETER["False northing",0,
##             LENGTHUNIT["metre",1],
##             ID["EPSG",8807]]],
##     CS[Cartesian,2],
##         AXIS["easting (X)",east,
##             ORDER[1],
##             LENGTHUNIT["metre",1]],
##         AXIS["northing (Y)",north,
##             ORDER[2],
##             LENGTHUNIT["metre",1]],
##     USAGE[
##         SCOPE["unknown"],
##         AREA["World - 85°S to 85°N"],
##         BBOX[-85.06,-180,85.06,180]],
##     ID["EPSG",3857]]
# Plot them together! 
  
ggplot() + 
  geom_sf(data = ca_subset) +
  geom_sf(data = sesbania, size = 1, color = "red")

# Find the count of red sesbania observed locations in this dataset by county 

# Spatial joins 
ca_sesbania <- ca_subset %>% 
  st_join(sesbania) 

sesbania_counts <- ca_sesbania %>% 
  count(county_name)
# Chloropleth using the number of records for red sesbania as the fill color 

ggplot(data = sesbania_counts) + 
  geom_sf(aes(fill = n), color = "white", size = 0.1) +
  scale_fill_gradientn(colors = c("lightgray", "orange", "red")) +
  theme_minimal() +
  labs(fill = "Number of S. punicea records")

# Suset of sesbania point locations only in Solano County
solano_sesbania <- sesbania %>% 
  filter(COUNTY == "Solano") 

# Onlykeep Solano polygon from California County data 
solano <- ca_subset %>% 
  filter(county_name == "Solano")

ggplot()+
  geom_sf(data = solano) +
  geom_sf(data = solano_sesbania)

C. Making an interactive map with {tmap}

# Set the viewing mode to "interactive": 
tmap_mode(mode = "view")
## tmap mode set to interactive viewing
# Then make a map (with the polygon fill color updated by variable 'land_area', updating the color palette to "BuGn"), then add another shape layer for the sesbania records (added as dots): 

tm_shape(ca_subset) + 
  tm_fill("land_area", palette = "BuGn") +
  tm_shape(sesbania) + 
  tm_dots()